%{

/*
** CSE 322, Lex/Yacc SKELETON: 
**
** ratml.l: "RATML" formatter, lex half
**
** see also ratml.h, ratml.y
**
** W. L. Ruzzo
**
*/

/*
** STUDENTS:
**
**   A: Add your name, Student ID number below:
**
**      Name: Kellen Donohue
**      Number: 0842001
**
**   B: You're FREE to change anything, but you don't NEED to 
**      change anything except the lex rules section.
*/

/* 
** Notes on the lexer/parser interface:
**
** 1. All lex tokens use the "str" case of the YYSTYPE union
** 2. yylval.str is set to NULL on all but the TEXT token type
** 3. the TEXT token type passes (through yylval) a non-NULL 
**    value, namely a pointer to a malloc'd COPY of the text string.
** 4. Token names are defined in the included file y.tab.h, 
**    which is generated by yacc
*/
%}

%{

/*
** include node typedefs, and token codes
*/
#include "ratml.h"
#include "y.tab.h"
#include <ctype.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

extern int yydebug;

/* function prototypes */
void toparser(const char str[]);
char *strdupl(const char *str);
int lookup(const char *str);

/*
** One row of the tag table: the text of a tag paired with the token
** code handed to the parser for it.
*/
typedef struct {
    const char *str;    /* tag text, e.g. "<P>"; NULL in the terminating row */
    int code;           /* token code associated with the tag */
} tagtbl_t;

/* 
** Table of recognized ratml tags and their associated 
** token codes.  The codes must agree with those 
** in ratml.y (and hence y.tab.h). 
**
** Students: you don't need to use this, or the 
** associated lookup routine, but it might be convenient.
** E.g., use lookup + one generic pattern for tags
** rather than many specific patterns.
*/

/*
** strcmp_nc: case-insensitive string comparison.
**
** Compares s1 and s2 character by character after folding each to
** upper case with toupper().  Returns 0 when the strings are equal
** ignoring case, otherwise the difference between the first pair of
** folded characters that differ (negative if s1 sorts first, positive
** if s2 does).
**
** The bytes are read as unsigned char throughout: toupper() is only
** defined for values representable as unsigned char (or EOF), so
** passing a plain char that happens to be negative is undefined.
*/
int strcmp_nc(const char *s1, const char *s2)
{
  const unsigned char *a = (const unsigned char *)s1;
  const unsigned char *b = (const unsigned char *)s2;

  while (toupper(*a) == toupper(*b))
  {
    if (*a == '\0')
      return 0;   /* reached both terminators together: equal */
    a++;
    b++;
  }
  return toupper(*a) - toupper(*b);
}


/*
** Tag table consulted by lookup(): each row pairs a tag's text with
** its token code.  Rows are grouped as opening/closing pairs followed
** by the standalone tags; the row with a NULL string ends the table.
*/
tagtbl_t tagtbl[] = {
    /* paired tags: opening form, then closing form */
    {"<HTML>",    BHTML},
    {"</HTML>",   EHTML},
    {"<HEAD>",    BHEAD},
    {"</HEAD>",   EHEAD},
    {"<TITLE>",   BTITLE},
    {"</TITLE>",  ETITLE},
    {"<BODY>",    BBODY},
    {"</BODY>",   EBODY},
    {"<UL>",      BUL},
    {"</UL>",     EUL},
    {"<OL>",      BOL},
    {"</OL>",     EOL},
    {"<H1>",      BH1},
    {"</H1>",     EH1},

    /* standalone tags */
    {"<P>",       P},
    {"<BR>",      BR},
    {"<LI>",      LI},

    /* end-of-list indicator */
    {NULL,        -1}
  };

%}

%%

%{

/*
** Set the parser's yydebug flag from the compile-time YYDEBUG macro:
** yydebug becomes 1 when YYDEBUG is nonzero and 0 otherwise.  If
** YYDEBUG has not been defined by this point it defaults to 0.
**
** NOTE(review): this block sits at the top of the rules section, ahead
** of the first rule; lex is expected to copy such code to the start of
** the generated scanner routine -- confirm against your lex's manual.
**
** Students: please have debug OFF when you turn in, but it might be
** useful earlier.
*/
#ifndef YYDEBUG
#define YYDEBUG	0	/* default: debugging off */
#endif
#if YYDEBUG
    yydebug = 1;
#else
    yydebug = 0;
#endif

%} /***  Lex rules below  ***  Students: definitely change them.  ***/
 
[ \t\n\r]+                 ; /* skip whitespace between tokens */

    /*
    ** A run of word/punctuation characters, or failing that any single
    ** character, goes to the parser as a TEXT token carrying a copy of
    ** the matched text.
    */
[0-9a-zA-Z\.!\?=*\-,;/\'\(\)\":\+\>]+              |
.                        {
                           toparser(yytext); return(TEXT);
                         }

    /* Character entities become TEXT holding the character they name. */
"&amp;"                  { toparser("&"); return(TEXT); }
"&gt;"                   { toparser(">"); return(TEXT); }
"&lt;"                   { toparser("<"); return(TEXT); }

    /*
    ** Comments are dropped: no token is returned, so scanning simply
    ** continues with whatever follows.  The body may contain anything
    ** except the terminator, which is a run of two or more dashes
    ** followed by '>'.  Excluding it from the body keeps two separate
    ** comments from being swallowed, together with the text between
    ** them, as one match; allowing the closing run to be longer than
    ** two dashes lets a comment that ends in three or more dashes
    ** terminate where it should.
    */
"<!--"([^\-]|-[^\-]|"--"-*[^\->])*"--"-*">"   { toparser(NULL); /* nothing sent to parser */ }

    /* Paragraph tag: return its token code, no text. */
"<P>"                    {
                           toparser(NULL); return(P);
                         }

    /*
    ** Any other <NAME> or </NAME> tag is classified by lookup().  An
    ** unrecognized tag comes back as TEXT, and TEXT tokens must carry a
    ** non-NULL string, so in that case the tag's own text is passed on;
    ** recognized tags carry no text.
    */
"<"[a-zA-Z0-9]+">"       |
"</"[a-zA-Z0-9]+">"      {
                           int code = lookup(yytext);
                           toparser(code == TEXT ? yytext : NULL);
                           return(code);
                         }

%% /* ------------------------------------------------------------- */

/*
**
** A few useful utility routines
**
*/

/*************************************************************
** yyerror: replacement for yacc's default error routine.
**
** Prints the parser's message together with the text of the most
** recently scanned token (yytext) on standard output.
**
** msg: description of the error, supplied by the parser.
** Returns 0; the return type was previously an implicit int, which
** C99 and later no longer allow, so it is now spelled out and a
** value is actually returned.
*/
int yyerror(const char *msg) {
    printf("Parse Error: %s. last token: '%s'\n", msg, yytext);
    return 0;
}

/*************************************************************
** yywrap: called by lex when it reaches end of input.  Returning 1
** tells it there are no further files to scan.
*/
int yywrap()
{
    return 1;
}

/************************************************************* 
** strdupl: duplicate a string into malloc'd memory.
**
** str: the NUL-terminated string to copy, or NULL.
**
** Returns a pointer to a freshly allocated copy, which the caller
** owns and should eventually free().  Returns NULL when str is NULL
** or when the allocation fails; the allocation result is checked
** before anything is written through it.
*/
char *strdupl(const char *str) {
    size_t size;
    char *copy;

    if (str == NULL) return NULL;

    size = strlen(str) + 1;         /* include the terminating NUL */
    copy = malloc(size);
    if (copy == NULL) return NULL;  /* out of memory: nothing to copy into */

    return memcpy(copy, str, size);
}

/* ************************************************************ 
** toparser: hand a token's string value to the parser.
**
** yacc picks up a token's value (if any) from yylval; every token
** here uses the "str" member of that union.
**
** The string is copied with strdupl() rather than stored directly,
** because it usually lives in yytext, which lex overwrites as it
** reads more input.  A NULL argument leaves yylval.str NULL.
*/
void toparser(const char str[]) {
    char *copy = strdupl(str);

    yylval.str = copy;
}

/************************************************************* 
** lookup string in table of recognized tags.  if found, 
** return associated token code. if not found, it's a TEXT 
** token.  Binary search would be faster...
*/
int lookup(const char *str) {
    int i;

    for( i=0; NULL != (tagtbl[i].str); i++) {
        if (0 == strcmp_nc(str, tagtbl[i].str)) {
            return tagtbl[i].code;
        }
    }
    return TEXT;
}
